// Copyright 2011 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#ifndef SRC_STRING_SEARCH_H_
#define SRC_STRING_SEARCH_H_

#if defined(NODE_WANT_INTERNALS) && NODE_WANT_INTERNALS

#include "node.h"
#include <string.h>

namespace node {
namespace stringsearch {

    // Yields the larger of |a| and |b|. When neither argument compares less
    // than the other, |a| is the one returned.
    template <typename T>
    T Max(T a, T b)
    {
        if (a < b) {
            return b;
        }
        return a;
    }

    // Largest value that fits in a single byte (0xff).
    // NOTE(review): presumably the highest code unit of a one-byte (Latin-1)
    // string; nothing in the visible part of this header refers to it.
    static const uint32_t kMaxOneByteCharCodeU = 0xff;

    // A non-owning view over |length| contiguous elements of type |T| that can
    // be indexed either front-to-back or back-to-front. The reversed mode lets
    // the search code treat "last index of" as "first index of" on mirrored
    // views of the same memory.
    template <typename T>
    class Vector {
    public:
        // Wraps the range [data, data + length). When |isForward| is false the
        // view is mirrored: index 0 refers to the last element in memory.
        // The range must be non-empty and |data| must not be null.
        Vector(T* data, size_t length, bool isForward)
            : start_(data)
            , length_(length)
            , is_forward_(isForward)
        {
            NODE_ASSERT(length > 0 && data != nullptr);
        }

        // Returns the start of the memory range.
        // For vector v this is NOT necessarily &v[0], see forward().
        const T* start() const { return start_; }

        // Returns the length of the vector, in characters.
        size_t length() const { return length_; }

        // Returns true if the Vector is front-to-back, false if back-to-front.
        // In the latter case, v[0] corresponds to the *end* of the memory range.
        // The result is a direction flag, so it is typed as bool.
        bool forward() const { return is_forward_; }

        // Accesses an individual element; the index is bounds-checked with
        // NODE_ASSERT. For a back-to-front vector the index counts from the
        // end of the memory range.
        T& operator[](size_t index) const
        {
            NODE_ASSERT(index < length_);
            return start_[is_forward_ ? index : (length_ - index - 1)];
        }

    private:
        T* start_; // First element of the underlying memory range.
        size_t length_; // Number of elements in the range.
        bool is_forward_; // Indexing direction, see forward().
    };

    //---------------------------------------------------------------------
    // String Search object.
    //---------------------------------------------------------------------

    // Class holding constants and methods that apply to all string search variants,
    // independently of subject and pattern char size.
    class StringSearchBase {
    protected:
        // Cap on the maximal shift in the Boyer-Moore implementation. By setting a
        // limit, we can fix the size of tables. For a needle longer than this limit,
        // search will not be optimal, since we only build tables for a suffix
        // of the string, but it is a safe approximation.
        static const int kBMMaxShift = 250;

        // Reduce alphabet to this size.
        // One of the tables used by Boyer-Moore and Boyer-Moore-Horspool has size
        // proportional to the input alphabet. We reduce the alphabet size by
        // equating input characters modulo a smaller alphabet size. This gives
        // a potentially less efficient searching, but is a safe approximation.
        // For needles using only characters in the same Unicode 256-code point page,
        // there is no search speed degradation.
        static const int kLatin1AlphabetSize = 256;
        static const int kUC16AlphabetSize = 256;

        // Minimum pattern length for which the Boyer-Moore family of strategies
        // is considered. For patterns below this length, the skip length of
        // Boyer-Moore is too short to compensate for the algorithmic overhead
        // compared to simple brute force.
        static const int kBMMinPatternLength = 8;

        // The three tables below are static data members, so their storage is
        // shared by every search object derived from this class.
        // NOTE(review): the definitions are not in this header, and concurrent
        // searches would presumably overwrite each other's tables - confirm
        // against the callers.

        // Store for the BoyerMoore(Horspool) bad char shift table. Its length is
        // the (reduced) alphabet size.
        static int kBadCharShiftTable[kUC16AlphabetSize];
        // Store for the BoyerMoore good suffix shift table.
        static int kGoodSuffixShiftTable[kBMMaxShift + 1];
        // Table used temporarily while building the BoyerMoore good suffix
        // shift table.
        static int kSuffixTable[kBMMaxShift + 1];
    };

    // Search object bound to a single pattern of |Char| code units (one or two
    // bytes wide). The constructor picks an initial strategy from the pattern
    // length; the strategies themselves may later replace |strategy_| with a
    // more sophisticated one.
    template <typename Char>
    class StringSearch : private StringSearchBase {
    public:
        // Binds the search object to |pattern|, which must not be empty.
        explicit StringSearch(Vector<const Char> pattern)
            : pattern_(pattern)
            , start_(pattern.length() >= kBMMaxShift ? pattern.length() - kBMMaxShift : 0)
        {
            size_t pattern_length = pattern_.length();
            NODE_CHECK_GT(pattern_length, 0);

            // Long enough patterns start with the bailing-out linear search;
            // shorter ones use a fixed strategy.
            if (pattern_length >= kBMMinPatternLength) {
                strategy_ = &InitialSearch;
            } else if (pattern_length == 1) {
                strategy_ = &SingleCharSearch;
            } else {
                strategy_ = &LinearSearch;
            }
        }

        // Runs the currently selected strategy on |subject|, starting at
        // |index|, and returns whatever that strategy returns.
        size_t Search(Vector<const Char> subject, size_t index)
        {
            return strategy_(this, subject, index);
        }

        // Number of entries of the bad-char table that are in use for this
        // character width.
        static inline int AlphabetSize()
        {
            static_assert(sizeof(Char) == sizeof(uint8_t) || sizeof(Char) == sizeof(uint16_t),
                "sizeof(Char) == sizeof(uint16_t) || sizeof(uint8_t)");

            if (sizeof(Char) != 1) {
                // UC16 needle.
                return kUC16AlphabetSize;
            }
            // Latin1 needle.
            return kLatin1AlphabetSize;
        }

    private:
        // Signature shared by all search strategies.
        using SearchFunction = size_t (*)(
            StringSearch<Char>*,
            Vector<const Char>,
            size_t);

        static size_t SingleCharSearch(StringSearch<Char>* search,
            Vector<const Char> subject,
            size_t start_index);

        static size_t LinearSearch(StringSearch<Char>* search,
            Vector<const Char> subject,
            size_t start_index);

        static size_t InitialSearch(StringSearch<Char>* search,
            Vector<const Char> subject,
            size_t start_index);

        static size_t BoyerMooreHorspoolSearch(
            StringSearch<Char>* search,
            Vector<const Char> subject,
            size_t start_index);

        static size_t BoyerMooreSearch(StringSearch<Char>* search,
            Vector<const Char> subject,
            size_t start_index);

        void PopulateBoyerMooreHorspoolTable();

        void PopulateBoyerMooreTable();

        // Looks up the table entry for |char_code|. One-byte characters index
        // the table directly; two-byte characters are first reduced to their
        // equivalence class modulo the table size.
        static inline int CharOccurrence(int* bad_char_occurrence,
            Char char_code)
        {
            const int table_index = (sizeof(Char) == 1)
                ? static_cast<int>(char_code)
                : static_cast<int>(char_code % kUC16AlphabetSize);
            return bad_char_occurrence[table_index];
        }

        // Storage of the BoyerMoore(Horspool) bad char shift table.
        int* bad_char_table() { return kBadCharShiftTable; }

        // Storage of the BoyerMoore good suffix shift table, biased by
        // |start_| so that pattern indices in the range
        // [start_..pattern_.length()] address the kGoodSuffixShiftTable array.
        int* good_suffix_shift_table()
        {
            int* const table = kGoodSuffixShiftTable;
            return table - start_;
        }

        // Scratch table used while building the good suffix shift table,
        // biased by |start_| so that pattern indices in the range
        // [start_..pattern_.length()] address the kSuffixTable array.
        int* suffix_table()
        {
            int* const table = kSuffixTable;
            return table - start_;
        }

        // The pattern to search for.
        Vector<const Char> pattern_;
        // Pointer to implementation of the search.
        SearchFunction strategy_;
        // Index of the first pattern character covered by the tables: the
        // pattern length minus kBMMaxShift, or 0 for shorter patterns.
        size_t start_;
    };

    // Rounds the pointer |value| down to the nearest multiple of |alignment|
    // and returns it with its original pointer type. |alignment| is expected
    // to be a power of two.
    template <typename T, typename U>
    inline T AlignDown(T value, U alignment)
    {
        // Build the mask at pointer width. Computing ~(alignment - 1) in a
        // narrower unsigned type would zero-extend when combined with the
        // address and wipe out its upper bits.
        const uintptr_t mask = ~(static_cast<uintptr_t>(alignment) - 1);
        return reinterpret_cast<T>(reinterpret_cast<uintptr_t>(value) & mask);
    }

    // Returns whichever of the two bytes making up |character| has the larger
    // numeric value.
    inline uint8_t GetHighestValueByte(uint16_t character)
    {
        const uint8_t low_byte = static_cast<uint8_t>(character & 0xFF);
        const uint8_t high_byte = static_cast<uint8_t>(character >> 8);
        return Max(low_byte, high_byte);
    }

    // One-byte overload: the character is its own (only) byte.
    inline uint8_t GetHighestValueByte(uint8_t character)
    {
        return character;
    }

    // Scans the |haystack_len| bytes starting at |haystack| from the last byte
    // towards the first and returns a pointer to the last byte equal to
    // |needle|, or nullptr if no byte matches.
    // When _GNU_SOURCE is defined the work is handed to memrchr(3) for speed;
    // elsewhere (e.g. on Windows) a plain loop is used instead.
    inline const void* MemrchrFill(const void* haystack, uint8_t needle,
        size_t haystack_len)
    {
#ifdef _GNU_SOURCE
        return memrchr(haystack, needle, haystack_len);
#else
        const uint8_t* const bytes = static_cast<const uint8_t*>(haystack);
        // Count down so that the unsigned index never has to go below zero.
        size_t remaining = haystack_len;
        while (remaining > 0) {
            --remaining;
            if (bytes[remaining] == needle) {
                return bytes + remaining;
            }
        }
        return nullptr;
#endif
    }

    // Finds the first occurrence of the *two-byte* character pattern[0] in
    // `subject`, looking at positions from `index` onwards. Does not check that
    // the rest of the pattern matches. Returns the position (in the vector's
    // own indexing direction), or subject.length() if there is none.
    template <typename Char>
    inline size_t FindFirstCharacter(Vector<const Char> pattern,
        Vector<const Char> subject, size_t index)
    {
        const size_t subject_length = subject.length();
        const size_t pattern_length = pattern.length();
        const Char first_char = pattern[0];
        // One past the last position at which the whole pattern still fits.
        const size_t max_n = (subject_length - pattern_length + 1);

        // memchr / memrchr are much faster than a simple loop but work on
        // bytes, so look for just one byte of pattern[0]: the numerically
        // larger one, on the assumption that it is the rarer of the two.
        const uint8_t probe_byte = GetHighestValueByte(first_char);

        for (size_t pos = index;;) {
            const size_t bytes_to_search = (max_n - pos) * sizeof(Char);
            const void* hit = nullptr;
            if (!subject.forward()) {
                NODE_CHECK_LE(pos, subject.length());
                NODE_CHECK_LE(subject.length() - pos, SIZE_MAX / sizeof(Char));
                hit = MemrchrFill(subject.start() + pattern_length - 1,
                    probe_byte,
                    bytes_to_search);
            } else {
                // Check that bytes_to_search cannot have overflowed.
                NODE_CHECK_LE(pos, max_n);
                NODE_CHECK_LE(max_n - pos, SIZE_MAX / sizeof(Char));
                hit = memchr(subject.start() + pos, probe_byte, bytes_to_search);
            }
            if (hit == nullptr) {
                return subject_length;
            }

            // The byte may be either half of a character: step back to the
            // start of that character and translate its address into an index.
            const Char* const candidate = AlignDown(static_cast<const Char*>(hit), sizeof(Char));
            const size_t raw_pos = static_cast<size_t>(candidate - subject.start());
            pos = subject.forward() ? raw_pos : (subject_length - raw_pos - 1);

            // Verify that both bytes of the candidate match pattern[0].
            if (subject[pos] == first_char) {
                return pos;
            }

            // Only the probed byte matched; resume after this position.
            if (++pos >= max_n) {
                break;
            }
        }

        return subject_length;
    }

    // One-byte specialization: finds the first occurrence of the byte
    // pattern[0] in `subject`, looking at positions from `index` onwards.
    // Does not verify that the rest of the pattern matches. Returns the
    // position (in the vector's own indexing direction), or subject.length()
    // if there is none.
    template <>
    inline size_t FindFirstCharacter(Vector<const uint8_t> pattern,
        Vector<const uint8_t> subject,
        size_t index)
    {
        const uint8_t first_byte = pattern[0];
        const size_t subject_length = subject.length();
        // One past the last position at which the whole pattern still fits.
        const size_t max_n = (subject_length - pattern.length() + 1);
        const size_t bytes_to_search = max_n - index;
        const bool is_forward = subject.forward();

        // A forward view is scanned front to back with memchr, a reversed view
        // back to front with MemrchrFill.
        const void* const hit = is_forward
            ? memchr(subject.start() + index, first_byte, bytes_to_search)
            : MemrchrFill(subject.start() + pattern.length() - 1,
                first_byte,
                bytes_to_search);
        if (hit == nullptr) {
            return subject_length;
        }

        // Translate the address of the hit into an index of the view.
        const size_t raw_pos = static_cast<size_t>(static_cast<const uint8_t*>(hit) - subject.start());
        if (is_forward) {
            return raw_pos;
        }
        return subject_length - raw_pos - 1;
    }

    //---------------------------------------------------------------------
    // Single Character Pattern Search Strategy
    //---------------------------------------------------------------------

    // Strategy for patterns of exactly one character: locating the first
    // character of the pattern is already the whole search. Returns the match
    // position, or subject.length() if the character does not occur.
    template <typename Char>
    size_t StringSearch<Char>::SingleCharSearch(
        StringSearch<Char>* search,
        Vector<const Char> subject,
        size_t index)
    {
        NODE_CHECK_EQ(1, search->pattern_.length());
        const size_t match_position = FindFirstCharacter(search->pattern_, subject, index);
        return match_position;
    }

    //---------------------------------------------------------------------
    // Linear Search Strategy
    //---------------------------------------------------------------------

    // Brute-force strategy for short patterns; it never switches to another
    // strategy. Each candidate is found by locating the first pattern
    // character and is then compared against the remaining characters.
    // Returns the match position, or subject.length() if there is no match.
    template <typename Char>
    size_t StringSearch<Char>::LinearSearch(
        StringSearch<Char>* search,
        Vector<const Char> subject,
        size_t index)
    {
        Vector<const Char> pattern = search->pattern_;
        NODE_CHECK_GT(pattern.length(), 1);
        const size_t pattern_length = pattern.length();
        // Last position at which the whole pattern still fits in the subject.
        const size_t n = subject.length() - pattern_length;

        size_t i = index;
        while (i <= n) {
            i = FindFirstCharacter(pattern, subject, i);
            if (i == subject.length()) {
                break;
            }
            NODE_ASSERT_LE(i, n);

            // The first character is known to match; walk over the rest until
            // a mismatch or the end of the pattern.
            size_t j = 1;
            while (j < pattern_length && pattern[j] == subject[i + j]) {
                j++;
            }
            if (j >= pattern_length) {
                return i;
            }
            i++;
        }
        return subject.length();
    }

    //---------------------------------------------------------------------
    // Boyer-Moore string search
    //---------------------------------------------------------------------

    // Boyer-Moore strategy: searches `subject` for the pattern of `search`,
    // starting at `start_index`, using both the bad-character table and the
    // good-suffix shift table of `search`. This function only reads the
    // tables; within this file the strategy is installed by
    // BoyerMooreHorspoolSearch right after it calls PopulateBoyerMooreTable().
    // Returns the position of the match, or subject.length() if there is none.
    template <typename Char>
    size_t StringSearch<Char>::BoyerMooreSearch(
        StringSearch<Char>* search,
        Vector<const Char> subject,
        size_t start_index)
    {
        Vector<const Char> pattern = search->pattern_;
        const size_t subject_length = subject.length();
        const size_t pattern_length = pattern.length();
        // Only preprocess at most kBMMaxShift last characters of pattern.
        size_t start = search->start_;

        int* bad_char_occurence = search->bad_char_table();
        int* good_suffix_shift = search->good_suffix_shift_table();

        Char last_char = pattern[pattern_length - 1];
        size_t index = start_index;
        // Continue search from index.
        while (index <= subject_length - pattern_length) {
            size_t j = pattern_length - 1;
            int c;
            // Slide the pattern along until the subject character under its last
            // position equals the pattern's last character; each step shifts by
            // the bad-character table entry of the mismatching subject character.
            while (last_char != (c = subject[index + j])) {
                int shift = j - CharOccurrence(bad_char_occurence, c);
                index += shift;
                if (index > subject_length - pattern_length) {
                    return subject.length();
                }
            }
            // Compare pattern and subject right to left; reaching j == 0 with
            // every character equal means the whole pattern matched at index.
            while (pattern[j] == (c = subject[index + j])) {
                if (j == 0) {
                    return index;
                }
                j--;
            }
            if (j < start) {
                // we have matched more than our tables allow us to be smart about.
                // Fall back on BMH shift.
                index += pattern_length - 1 - CharOccurrence(bad_char_occurence, static_cast<Char>(last_char));
            } else {
                // Mismatch at pattern position j: advance by the larger of the
                // good-suffix shift and the bad-character shift.
                int gs_shift = good_suffix_shift[j + 1];
                int bc_occ = CharOccurrence(bad_char_occurence, c);
                int shift = j - bc_occ;
                if (gs_shift > shift) {
                    shift = gs_shift;
                }
                index += shift;
            }
        }

        return subject.length();
    }

    // Fills the good-suffix shift table for the pattern positions in
    // [start_, pattern_length], using the suffix table as scratch space.
    // Both tables are accessed through biased pointers, so they are indexed
    // with pattern positions rather than with offsets from start_.
    template <typename Char>
    void StringSearch<Char>::PopulateBoyerMooreTable()
    {
        const size_t pattern_length = pattern_.length();
        Vector<const Char> pattern = pattern_;
        // Only look at the last kBMMaxShift characters of pattern (from start_
        // to pattern_length).
        const size_t start = start_;
        const size_t length = pattern_length - start;

        // Biased tables so that we can use pattern indices as table indices,
        // even if we only cover the part of the pattern from offset start.
        int* shift_table = good_suffix_shift_table();
        int* suffix_table = this->suffix_table();

        // Initialize table. The value `length` doubles as the "not yet set"
        // marker that the loops below test for.
        for (size_t i = start; i < pattern_length; i++) {
            shift_table[i] = length;
        }
        shift_table[pattern_length] = 1;
        suffix_table[pattern_length] = pattern_length + 1;

        // Nothing to preprocess when the covered part of the pattern is empty.
        if (pattern_length <= start) {
            return;
        }

        // Find suffixes.
        Char last_char = pattern_[pattern_length - 1];
        size_t suffix = pattern_length + 1;
        {
            // Walk the covered part of the pattern from its end towards start.
            size_t i = pattern_length;
            while (i > start) {
                Char c = pattern[i - 1];
                while (suffix <= pattern_length && c != pattern[suffix - 1]) {
                    if (static_cast<size_t>(shift_table[suffix]) == length) {
                        shift_table[suffix] = suffix - i;
                    }
                    suffix = suffix_table[suffix];
                }
                suffix_table[--i] = --suffix;
                if (suffix == pattern_length) {
                    // No suffix to extend, so we check against last_char only.
                    while ((i > start) && (pattern[i - 1] != last_char)) {
                        if (static_cast<size_t>(shift_table[pattern_length]) == length) {
                            shift_table[pattern_length] = pattern_length - i;
                        }
                        suffix_table[--i] = pattern_length;
                    }
                    if (i > start) {
                        suffix_table[--i] = --suffix;
                    }
                }
            }
        }
        // Build shift table using suffixes: entries still holding the marker
        // value receive a shift derived from the current suffix.
        if (suffix < pattern_length) {
            for (size_t i = start; i <= pattern_length; i++) {
                if (static_cast<size_t>(shift_table[i]) == length) {
                    shift_table[i] = suffix - start;
                }
                if (i == suffix) {
                    suffix = suffix_table[suffix];
                }
            }
        }
    }

    //---------------------------------------------------------------------
    // Boyer-Moore-Horspool string search.
    //---------------------------------------------------------------------

    // Boyer-Moore-Horspool strategy: searches `subject` for the pattern of
    // `search`, starting at `start_index`, using only the bad-character table.
    // While searching it keeps a "badness" score; once that score becomes
    // positive it builds the good-suffix table, replaces search->strategy_ with
    // BoyerMooreSearch and lets that strategy continue from the current index.
    // Returns the position of the match, or the subject length if there is none.
    template <typename Char>
    size_t StringSearch<Char>::BoyerMooreHorspoolSearch(
        StringSearch<Char>* search,
        Vector<const Char> subject,
        size_t start_index)
    {
        Vector<const Char> pattern = search->pattern_;
        const size_t subject_length = subject.length();
        const int64_t pattern_length = pattern.length();
        int* char_occurrences = search->bad_char_table();
        // How bad we are doing without a good-suffix table.
        int64_t badness = -pattern_length;

        // Shift applied after a failed comparison, derived from the table entry
        // of the pattern's last character.
        Char last_char = pattern[pattern_length - 1];
        int last_char_shift = pattern_length - 1 - CharOccurrence(char_occurrences, static_cast<Char>(last_char));

        // Perform search
        size_t index = start_index; // No matches found prior to this index.
        while (index <= subject_length - pattern_length) {
            size_t j = pattern_length - 1;
            int subject_char;
            // Slide the pattern along until the subject character under its last
            // position equals the pattern's last character.
            while (last_char != (subject_char = subject[index + j])) {
                int bc_occ = CharOccurrence(char_occurrences, subject_char);
                int shift = j - bc_occ;
                index += shift;
                badness += 1 - shift; // at most zero, so badness cannot increase.
                if (index > subject_length - pattern_length) {
                    return subject_length;
                }
            }
            // The last character matches; compare the remaining ones right to left.
            j--;
            while (pattern[j] == (subject[index + j])) {
                if (j == 0) {
                    return index;
                }
                j--;
            }
            index += last_char_shift;
            // Badness increases by the number of characters we have
            // checked, and decreases by the number of characters we
            // can skip by shifting. It's a measure of how we are doing
            // compared to reading each character exactly once.
            badness += (pattern_length - j) - last_char_shift;
            if (badness > 0) {
                // Doing badly: upgrade to full Boyer-Moore for this and all
                // later searches made through this search object.
                search->PopulateBoyerMooreTable();
                search->strategy_ = &BoyerMooreSearch;
                return BoyerMooreSearch(search, subject, index);
            }
        }
        return subject.length();
    }

    // Fills the bad-character table: for every character (equivalence class)
    // it records the index of its last occurrence in the preprocessed part of
    // the pattern, not counting the pattern's final character. Classes that do
    // not occur there keep the initial value start_ - 1 (i.e. -1 when the
    // whole pattern is preprocessed).
    template <typename Char>
    void StringSearch<Char>::PopulateBoyerMooreHorspoolTable()
    {
        int* const occurrence_table = bad_char_table();

        // Only the last kBMMaxShift characters of the pattern are preprocessed.
        const size_t first = start_;
        const size_t last = pattern_.length() - 1;
        const size_t entry_count = AlphabetSize();

        if (first != 0) {
            for (size_t slot = 0; slot < entry_count; slot++) {
                occurrence_table[slot] = first - 1;
            }
        } else {
            // Patterns shorter than kBMMaxShift: every entry starts as -1.
            memset(occurrence_table, -1, entry_count * sizeof(*occurrence_table));
        }

        // Walk forwards so that the *last* occurrence of each class is the one
        // that ends up in the table. The final character is left out.
        for (size_t position = first; position < last; position++) {
            const Char c = pattern_[position];
            int bucket;
            if (sizeof(Char) == 1) {
                bucket = c;
            } else {
                bucket = c % AlphabetSize();
            }
            occurrence_table[bucket] = position;
        }
    }

    //---------------------------------------------------------------------
    // Linear string search with bailout to BMH.
    //---------------------------------------------------------------------

    // Initial strategy for patterns of at least kBMMinPatternLength characters:
    // a simple linear search that bails out if the string isn't found very
    // early in the subject. On bail-out it populates the bad-character table,
    // replaces search->strategy_ with BoyerMooreHorspoolSearch and lets that
    // strategy continue from the current position.
    // Returns the position of the match, or subject.length() if there is none.
    template <typename Char>
    size_t StringSearch<Char>::InitialSearch(
        StringSearch<Char>* search,
        Vector<const Char> subject,
        size_t index)
    {
        Vector<const Char> pattern = search->pattern_;
        const size_t pattern_length = pattern.length();
        // Badness is a count of how much work we have done.  When we have
        // done enough work we decide it's probably worth switching to a better
        // algorithm.
        // The initial budget is computed in int64_t: in unsigned size_t
        // arithmetic the negative value would wrap, and where size_t is
        // narrower than 64 bits it would then convert to a large positive
        // number, skipping the linear phase entirely.
        int64_t badness = -10 - (static_cast<int64_t>(pattern_length) << 2);

        // FindFirstCharacter quickly skips positions whose first character
        // doesn't match; only the remaining pattern characters are compared
        // here, starting at index 1.
        for (size_t i = index, n = subject.length() - pattern_length; i <= n; i++) {
            badness++;
            if (badness <= 0) {
                i = FindFirstCharacter(pattern, subject, i);
                if (i == subject.length())
                    return subject.length();
                NODE_ASSERT_LE(i, n);
                size_t j = 1;
                do {
                    if (pattern[j] != subject[i + j]) {
                        break;
                    }
                    j++;
                } while (j < pattern_length);
                if (j == pattern_length) {
                    return i;
                }
                // Charge the characters compared at this failed candidate.
                badness += j;
            } else {
                // Budget used up: upgrade to Boyer-Moore-Horspool for this and
                // all later searches made through this search object.
                search->PopulateBoyerMooreHorspoolTable();
                search->strategy_ = &BoyerMooreHorspoolSearch;
                return BoyerMooreHorspoolSearch(search, subject, i);
            }
        }
        return subject.length();
    }

    // Performs a single stand-alone search for `pattern` in `subject`,
    // starting at `start_index`.
    // When the same pattern is searched for repeatedly, construct one
    // StringSearch object and call its Search function for each search
    // instead of using this helper.
    template <typename Char>
    size_t SearchString(Vector<const Char> subject,
        Vector<const Char> pattern,
        size_t start_index)
    {
        return StringSearch<Char>(pattern).Search(subject, start_index);
    }
} // namespace stringsearch
} // namespace node

namespace node {
using node::stringsearch::Vector;

// Searches `haystack` for `needle`. A forward search looks for the first
// occurrence starting at `start_index`; a backward search looks for the last
// occurrence that begins at or before `start_index`. Returns the position of
// the match in `haystack`, or `haystack_length` if there is none.
template <typename Char>
size_t SearchString(const Char* haystack,
    size_t haystack_length,
    const Char* needle,
    size_t needle_length,
    size_t start_index,
    bool is_forward)
{
    // A backward search (lastIndexOf rather than indexOf) reuses the forward
    // code by searching through mirrored views of both strings: v_needle[0]
    // is then the *last* character of the needle, and the search finds the
    // first instance of rev(needle) in rev(haystack).
    Vector<const Char> v_needle(needle, needle_length, is_forward);
    Vector<const Char> v_haystack(haystack, haystack_length, is_forward);
    NODE_ASSERT(haystack_length >= needle_length);

    // Highest position at which the needle can start.
    const size_t last_start = haystack_length - needle_length;

    // Translate the starting position into the indexing of the views.
    size_t relative_start_index = start_index;
    if (!is_forward) {
        relative_start_index = (last_start < start_index) ? 0 : (last_start - start_index);
    }

    const size_t pos = node::stringsearch::SearchString(
        v_haystack, v_needle, relative_start_index);

    // "Not found" and forward results need no translation.
    if (pos == haystack_length || is_forward) {
        return pos;
    }
    // Map the position in the mirrored view back to the original string.
    return last_start - pos;
}
} // namespace node

#endif // defined(NODE_WANT_INTERNALS) && NODE_WANT_INTERNALS

#endif // SRC_STRING_SEARCH_H_
